***
* 6 Months Sample
***

**# Bookmark #: Main Outcomes
	
*** Load dataset
// First, re-run relevant regressions (we will use pooled sample)

	cd "$userpath\1_datasets"
	use 4_individual_ano_reg_1.dta, clear
		
	
	
	
**# Bookmark #1: Export relevant values (coefficients, p-values ) 
	global outcomes    aminwage wrcon  sosec empquality_ind lwage satisfied hoursworked training_any 
	di "Outcomes: $outcomes"
	
	local c=1
	foreach o of global outcomes {
		
		
	* Repeat ITT estimation to recover coefficients & p-values 
	areg `o' assignment `o'_2021 $controls_balance missing_`o'_2021 i.wave  $se_indiv
	
	// Save coefficients in matrix 
	matrix define main_b`c'=(r(table)[1,1])
	matrix rownames main_b`c'=`o'
	
	// save p-values in matrix 
	matrix define main_p`c' = (r(table)[4,1])
	matrix rownames main_p`c'=`o'
		
	local ++c 
			
	}
	



	
*** Matrix with p-values 
clear 

matrix define b_main=(main_b1 \  main_b2 \ main_b3 \ main_b4 \ main_b5 \ main_b6 \ main_b7 \ main_b8)
matrix define p_main= (main_p1 \ main_p2 \ main_p3 \ main_p4 \ main_p5 \ main_p6 \ main_p7  \ main_p8) 

matrix list b_main 
matrix list p_main // Note: in case of changes in list of outcome variables, check table labels 


*** Export p-values 
svmat float p_main

rename p_main1 pval 
	cd "$userpath\1_datasets"
save main_pvals.dta, replace 

gen line=_n 
gen outcomevar=""

di  "empquality sh_aminwage sh_wrcon sh_sosec"

replace outcomevar="Min. Wage (0/1)" 				if line==1 
replace outcomevar="Written Contract (0/1)" 					if line==2
replace outcomevar="Social Security (0/1)" 					if line==3
replace outcomevar="Formality Index (0-1)" 			if line==4 
replace outcomevar="Wage (Log)" 							if line==5
replace outcomevar="Satisfied (0/1)" 						if line==6
replace outcomevar="Hours worked" 					if line==7
replace outcomevar="Training Participation (0/1)"			if line==8

svmat float b_main 
rename b_main b 
order line outcomevar b pval 
 
	cd "$userpath\1_datasets"
save main_pvals_labels.dta, replace 

	
	
**# Bookmark #: Retention
	
*** Load dataset
// First, re-run relevant regressions (we will use pooled sample)

eststo clear
cd "$userpath/1_datasets"

* Load individual-level data
use 	4_individual_ano_vars.dta, clear
drop 	if id_indiv == .
by 		id_indiv: egen number = count(id_indiv)

* Merge retention info from firm-level data
merge 	m:1 id wave using 2_firm_regressions.dta, keepusing(empretention_2021 morethan1visit)
ren 	empretention_2021 empretention_raw_2021
replace empretention_raw_2021 = 0 if missing(empretention_raw_2021)
gen 	left_firm_raw_2021 = 1 - empretention_raw_2021
drop 	_merge

* Merge additional data for balancing
merge m:1 id_indiv using 4_data_for_balance.dta, ///
	keepusing(left_firm_22 used_w2_w1 firm_att left_firm_23 left_firm22 left_firm23 used_w3_w1 used_w2_w1)

* Standardize left firm variable
egen 	left_firm_2021 = std(left_firm_raw_2021)
replace left_firm_2021 = 0 if missing(left_firm_raw_2021)
gen 	missing_left_firm_2021 = missing(left_firm_raw_2021)

* Treatment assignment
ren 	assignment ITT

* ----------------------------------------------
* Construct analysis sample 
* ----------------------------------------------
gen 			emp_w1_temp = 1 if employer==1 & wave ==1
bysort 			id_indiv: egen emp_w1 = max(emp_w1_temp)

gen 			int_w1t 	= 1 if wave == 1
bys				id_indiv: 	egen int_w1 = max(int_w1t)
drop 			int_w1t
sort 			id_indiv wave
replace 		employer = employer[_n-1] if wave == 2 & status == 3 & id_indiv == id_indiv[_n-1] & int_w1 == 1
replace 		employer = employer[_n-1] if wave == 3 & status == 3 & id_indiv == id_indiv[_n-1] 

*Wave 2
gen 	left_firm_w2 = 1 if left_firm22 == 1 & wave == 1
replace left_firm_w2 = 0 if used_w2_w1 == 1 & wave == 1
bys 	id_indiv: egen temp = max(left_firm_w2)

replace left_firm_w2 = 0 if wave == 2 & status == 1 & employer == 0 & consent_individual == 1 & missing(temp)

gen 	temp2 = 1 if used_w3_w1 == 1 & wave == 3 & consent_individual == 1 & employer == 0 & ///
	status == 4 & used_w2_w1 == 0
replace left_firm_w2 = 0 if temp2 == 1
drop 	temp temp2

*Wave 3
	
gen 	left_firm_w3 = 1 if status == 3 & wave == 3 & employer==0
replace left_firm_w3 = 0 if status == 1 & wave == 3 & employer==0
replace left_firm_w3 = 0 if status == 4 & wave == 3 & employer==0
replace left_firm_w3 = . if wave == 3 & firm_att==1 & employer==0
replace left_firm_w3 = . if wave == 3 & inlist(id_indiv, 260175, 330633, 601383, 612011, 640326, 640328, 699933, 705895)

* ----------------------------------------------
* Outcome Variables: Quit and Layoff
* ----------------------------------------------

* ----------------------------------------------
* Quit Variables
* ----------------------------------------------

* Wave 2
gen     quit_w2_flag = 1 if left == 4 & wave == 2
bys     id_indiv: egen quit_w2_max = max(quit_w2_flag)
gen     quit_w2 = 1 if quit_w2_max == 1 & left_firm_w2 == 1
replace quit_w2 = 0 if left_firm_w2 == 0
replace quit_w2 =. if wave !=1
drop    quit_w2_flag quit_w2_max

* Wave 3
gen     quit_w3 = 1 if left == 4 & wave == 3
replace quit_w3 = 0 if left_firm_w3 == 0 & wave == 3
replace quit_w3 =. if wave !=3

* ----------------------------------------------
* Layoff Variables
* ----------------------------------------------

* Wave 2
gen     layoff_w2_flag = 1 if inlist(left, 1, 2) & wave == 2
bys     id_indiv: egen layoff_w2_max = max(layoff_w2_flag)
gen     layoff_w2 = 1 if layoff_w2_max == 1 & left_firm_w2 == 1
replace layoff_w2 = 0 if left_firm_w2 == 0
replace	layoff_w2 =. if wave !=1
drop    layoff_w2_flag layoff_w2_max

* Wave 3
gen     layoff_w3 = 1 if inlist(left, 1, 2) & wave == 3
replace layoff_w3 = 0 if left_firm_w3 == 0 & wave == 3
replace	layoff_w3 =. if wave !=3

global 	outcomes left_firm_w2 left_firm_w3 quit_w2 quit_w3 layoff_w2 layoff_w3 

	local c=1
	foreach o of global outcomes {
		
		
	* Repeat ITT estimation to recover coefficients & p-values 
	areg `o' ITT educ_tert_base i.ag_ano left_firm_2021 missing_left_firm_2021 $se_indiv 
	
	// Save coefficients in matrix 
	matrix define main_b`c'=(r(table)[1,1])
	matrix rownames main_b`c'=`o'
	
	// save p-values in matrix 
	matrix define main_p`c' = (r(table)[4,1])
	matrix rownames main_p`c'=`o'
		
	local ++c 
			
	}
	



	
*** Matrix with p-values 
clear 


matrix define b_main=(main_b1 \  main_b2 \ main_b3   )
matrix define p_main= (main_p1 \  main_p2 \ main_p3  ) 

matrix list b_main 
matrix list p_main // Note: in case of changes in list of outcome variables, check table labels 


*** Export p-values 
svmat float p_main

rename p_main1 pval 
	cd "$userpath\1_datasets"
save retention_pvals.dta, replace 

gen line=_n 
gen outcomevar=""

replace line = 9 if line == 1
replace line = 10 if line == 2
replace line = 11 if line == 3
replace outcomevar="Left Firm (0/1)"			 			if line==9 
replace outcomevar="Quit (0/1)"			 					if line==10 
replace outcomevar="Layoff (0/1)"			 				if line==11



svmat float b_main 
rename b_main b 
order line outcomevar b pval 
 
	cd "$userpath\1_datasets"
save retention_pvals_labels.dta, replace 
	
	

	use main_pvals_labels.dta, clear
	append using retention_pvals_labels.dta 
	*append using emp_pvals_labels.dta 
	save pvals_labels.dta, replace
	
	erase "main_pvals_labels.dta" 
	erase "retention_pvals_labels.dta"	
	*erase "emp_pvals_labels.dta"
	
**# Bookmark #3: Calculate q-values 


* This code generates BKY (2006) sharpened two-stage q-values as described in Anderson (2008), "Multiple Inference and Gender Differences in the Effects of Early Intervention: A Reevaluation of the Abecedarian, Perry Preschool, and Early Training Projects", Journal of the American Statistical Association, 103(484), 1481-1495

* BKY (2006) sharpened two-stage q-values are introduced in Benjamini, Krieger, and Yekutieli (2006), "Adaptive Linear Step-up Procedures that Control the False Discovery Rate", Biometrika, 93(3), 491-507

* Last modified: M. Anderson, 11/20/07
* Test Platform: Stata/MP 10.0 for Macintosh (Intel 32-bit), Mac OS X 10.5.1
* Should be compatible with Stata 10 or greater on all platforms
* Likely compatible with with Stata 9 or earlier on all platforms (remove "version 10" line below)

clear  
version 17

****  INSTRUCTIONS:
****    Please start with a clear data set
****	When prompted, paste the vector of p-values you are testing into the "pval" variable
****	Please use the "do" button rather than the "run" button to run this file (if you use "run", you will miss the instructions at the prompts)

pause on
set more off

display "***********************************"
display "Please paste the vector of p-values that you wish to test into the variable 'pval'"
display	"After pasting, type 'q' to resume"
display "***********************************"

pause

	cd "$userpath\1_datasets"
use main_pvals.dta, clear 
append using retention_pvals.dta 
*append using emp_pvals.dta

* Collect the total number of p-values tested

quietly sum pval
local totalpvals = r(N)

* Sort the p-values in ascending order and generate a variable that codes each p-value's rank

quietly gen int original_sorting_order = _n
quietly sort pval
quietly gen int rank = _n if pval~=.

* Set the initial counter to 1 

local qval = 1

* Generate the variable that will contain the BKY (2006) sharpened q-values

gen bky06_qval = 1 if pval~=.

* Set up a loop that begins by checking which hypotheses are rejected at q = 1.000, then checks which hypotheses are rejected at q = 0.999, then checks which hypotheses are rejected at q = 0.998, etc.  The loop ends by checking which hypotheses are rejected at q = 0.001.


while `qval' > 0 {
	* First Stage
	* Generate the adjusted first stage q level we are testing: q' = q/1+q
	local qval_adj = `qval'/(1+`qval')
	* Generate value q'*r/M
	gen fdr_temp1 = `qval_adj'*rank/`totalpvals'
	* Generate binary variable checking condition p(r) <= q'*r/M
	gen reject_temp1 = (fdr_temp1>=pval) if pval~=.
	* Generate variable containing p-value ranks for all p-values that meet above condition
	gen reject_rank1 = reject_temp1*rank
	* Record the rank of the largest p-value that meets above condition
	egen total_rejected1 = max(reject_rank1)

	* Second Stage
	* Generate the second stage q level that accounts for hypotheses rejected in first stage: q_2st = q'*(M/m0)
	local qval_2st = `qval_adj'*(`totalpvals'/(`totalpvals'-total_rejected1[1]))
	* Generate value q_2st*r/M
	gen fdr_temp2 = `qval_2st'*rank/`totalpvals'
	* Generate binary variable checking condition p(r) <= q_2st*r/M
	gen reject_temp2 = (fdr_temp2>=pval) if pval~=.
	* Generate variable containing p-value ranks for all p-values that meet above condition
	gen reject_rank2 = reject_temp2*rank
	* Record the rank of the largest p-value that meets above condition
	egen total_rejected2 = max(reject_rank2)

	* A p-value has been rejected at level q if its rank is less than or equal to the rank of the max p-value that meets the above condition
	replace bky06_qval = `qval' if rank <= total_rejected2 & rank~=.
	* Reduce q by 0.001 and repeat loop
	drop fdr_temp* reject_temp* reject_rank* total_rejected*
	local qval = `qval' - .001
}
	

	quietly sort original_sorting_order
	pause off
	set more on

	display "Code has completed."
	display "Benjamini Krieger Yekutieli (2006) sharpened q-vals are in variable 'bky06_qval'"
	display	"Sorting order is the same as the original vector of p-values"

	gen line=_n
	list line pval bky06_qval
	
	cd "$userpath\1_datasets"
	merge 1:1 line using pvals_labels.dta 

	list line outcomevar pval bky06_qval
	

	cd "$userpath\1_datasets"
	drop _merge 
	save pvals_all.dta, replace 

	
	
	
	
* Note: Sharpened FDR q-vals can be LESS than unadjusted p-vals when many hypotheses are rejected, because if you have many true rejections, then you can tolerate several false rejections too (this effectively just happens for p-vals that are so large that you are not going to reject them regardless).

	cd "$userpath\1_datasets"

	use pvals_all.dta, clear 
	order line outcomevar b pval bky06_qval
	drop original_sorting_order rank
	format b %9.3f
format pval %9.3f
format bky06_qval %9.3f
	cd "$userpath_writing/Tables"

	texsave  using "sharpened_q_values_6M.tex",  replace
	
	texsave line outcomevar b pval bky06_qval using "sharpened_q_values_6M.tex", ///
    title("Table B1: Sharpened q-values") ///
    headerlines("Estimate \#" "Outcome Variable" "Coefficient" "P-value" "Sharpened q-value") ///
    footnote("Note: Sharpened two-stage q-values are calculated as described in Anderson (2008) and introduced in Benjamini, Krieger, and Yekutieli (2006).") ///
    replace

	
****
* 18 months sample
****

	
**# Bookmark #: Main Outcomes
	
*** Load dataset
// First, re-run relevant regressions (we will use pooled sample)

	cd "$userpath\1_datasets"
	use 4_individual_ano_reg_2.dta, clear
		
	
	
	
**# Bookmark #1: Export relevant values (coefficients, p-values ) 
	global outcomes    aminwage wrcon  sosec empquality_ind lwage satisfied hoursworked training_any 
	di "Outcomes: $outcomes"
	
	local c=1
	foreach o of global outcomes {
		
		
	* Repeat ITT estimation to recover coefficients & p-values 
	areg `o' assignment `o'_2021 $controls_balance missing_`o'_2021 i.wave  $se_indiv
	
	// Save coefficients in matrix 
	matrix define main_b`c'=(r(table)[1,1])
	matrix rownames main_b`c'=`o'
	
	// save p-values in matrix 
	matrix define main_p`c' = (r(table)[4,1])
	matrix rownames main_p`c'=`o'
		
	local ++c 
			
	}
	



	
*** Matrix with p-values 
clear 

matrix define b_main=(main_b1 \  main_b2 \ main_b3 \ main_b4 \ main_b5 \ main_b6 \ main_b7 \ main_b8)
matrix define p_main= (main_p1 \ main_p2 \ main_p3 \ main_p4 \ main_p5 \ main_p6 \ main_p7  \ main_p8) 

matrix list b_main 
matrix list p_main // Note: in case of changes in list of outcome variables, check table labels 


*** Export p-values 
svmat float p_main

rename p_main1 pval 
	cd "$userpath\1_datasets"
save main_pvals.dta, replace 

gen line=_n 
gen outcomevar=""

di  "empquality sh_aminwage sh_wrcon sh_sosec"

replace outcomevar="Min. Wage (0/1)" 				if line==1 
replace outcomevar="Written Contract (0/1)" 					if line==2
replace outcomevar="Social Security (0/1)" 					if line==3
replace outcomevar="Formality Index (0-1)" 			if line==4 
replace outcomevar="Wage (Log)" 							if line==5
replace outcomevar="Satisfied (0/1)" 						if line==6
replace outcomevar="Hours worked" 					if line==7
replace outcomevar="Training Participation (0/1)"			if line==8

svmat float b_main 
rename b_main b 
order line outcomevar b pval 
 
	cd "$userpath\1_datasets"
save main_pvals_labels.dta, replace 

	
	
**# Bookmark #: Retention
	eststo clear
cd "$userpath/1_datasets"

* Load individual-level data
use 	4_individual_ano_vars.dta, clear
drop 	if id_indiv == .
by 		id_indiv: egen number = count(id_indiv)

* Merge retention info from firm-level data
merge 	m:1 id wave using 2_firm_regressions.dta, keepusing(empretention_2021 morethan1visit)
ren 	empretention_2021 empretention_raw_2021
replace empretention_raw_2021 = 0 if missing(empretention_raw_2021)
gen 	left_firm_raw_2021 = 1 - empretention_raw_2021
drop 	_merge

* Merge additional data for balancing
merge m:1 id_indiv using 4_data_for_balance.dta, ///
	keepusing(left_firm_22 used_w2_w1 firm_att left_firm_23 left_firm22 left_firm23 used_w3_w1 used_w2_w1)

* Standardize left firm variable
egen 	left_firm_2021 = std(left_firm_raw_2021)
replace left_firm_2021 = 0 if missing(left_firm_raw_2021)
gen 	missing_left_firm_2021 = missing(left_firm_raw_2021)

* Treatment assignment
ren 	assignment ITT

* ----------------------------------------------
* Construct analysis sample 
* ----------------------------------------------
gen 			emp_w1_temp = 1 if employer==1 & wave ==1
bysort 			id_indiv: egen emp_w1 = max(emp_w1_temp)

gen 			int_w1t 	= 1 if wave == 1
bys				id_indiv: 	egen int_w1 = max(int_w1t)
drop 			int_w1t
sort 			id_indiv wave
replace 		employer = employer[_n-1] if wave == 2 & status == 3 & id_indiv == id_indiv[_n-1] & int_w1 == 1
replace 		employer = employer[_n-1] if wave == 3 & status == 3 & id_indiv == id_indiv[_n-1] 

*Wave 2
gen 	left_firm_w2 = 1 if left_firm22 == 1 & wave == 1
replace left_firm_w2 = 0 if used_w2_w1 == 1 & wave == 1
bys 	id_indiv: egen temp = max(left_firm_w2)

replace left_firm_w2 = 0 if wave == 2 & status == 1 & employer == 0 & consent_individual == 1 & missing(temp)

gen 	temp2 = 1 if used_w3_w1 == 1 & wave == 3 & consent_individual == 1 & employer == 0 & ///
	status == 4 & used_w2_w1 == 0
replace left_firm_w2 = 0 if temp2 == 1
drop 	temp temp2

*Wave 3
	
gen 	left_firm_w3 = 1 if status == 3 & wave == 3 & employer==0
replace left_firm_w3 = 0 if status == 1 & wave == 3 & employer==0
replace left_firm_w3 = 0 if status == 4 & wave == 3 & employer==0
replace left_firm_w3 = . if wave == 3 & firm_att==1 & employer==0
replace left_firm_w3 = . if wave == 3 & inlist(id_indiv, 260175, 330633, 601383, 612011, 640326, 640328, 699933, 705895)

* ----------------------------------------------
* Outcome Variables: Quit and Layoff
* ----------------------------------------------

* ----------------------------------------------
* Quit Variables
* ----------------------------------------------

* Wave 2
gen     quit_w2_flag = 1 if left == 4 & wave == 2
bys     id_indiv: egen quit_w2_max = max(quit_w2_flag)
gen     quit_w2 = 1 if quit_w2_max == 1 & left_firm_w2 == 1
replace quit_w2 = 0 if left_firm_w2 == 0
replace quit_w2 =. if wave !=1
drop    quit_w2_flag quit_w2_max

* Wave 3
gen     quit_w3 = 1 if left == 4 & wave == 3
replace quit_w3 = 0 if left_firm_w3 == 0 & wave == 3
replace quit_w3 =. if wave !=3

* ----------------------------------------------
* Layoff Variables
* ----------------------------------------------

* Wave 2
gen     layoff_w2_flag = 1 if inlist(left, 1, 2) & wave == 2
bys     id_indiv: egen layoff_w2_max = max(layoff_w2_flag)
gen     layoff_w2 = 1 if layoff_w2_max == 1 & left_firm_w2 == 1
replace layoff_w2 = 0 if left_firm_w2 == 0
replace	layoff_w2 =. if wave !=1
drop    layoff_w2_flag layoff_w2_max

* Wave 3
gen     layoff_w3 = 1 if inlist(left, 1, 2) & wave == 3
replace layoff_w3 = 0 if left_firm_w3 == 0 & wave == 3
replace	layoff_w3 =. if wave !=3

global 	outcomes left_firm_w2 left_firm_w3 quit_w2 quit_w3 layoff_w2 layoff_w3 
	
	local c=1
	foreach o of global outcomes {
		
		
	* Repeat ITT estimation to recover coefficients & p-values 
	areg `o' ITT educ_tert_base i.ag_ano left_firm_2021 missing_left_firm_2021 $se_indiv 
	
	// Save coefficients in matrix 
	matrix define main_b`c'=(r(table)[1,1])
	matrix rownames main_b`c'=`o'
	
	// save p-values in matrix 
	matrix define main_p`c' = (r(table)[4,1])
	matrix rownames main_p`c'=`o'
		
	local ++c 
			
	}
	



	
*** Matrix with p-values 
clear 

matrix define b_main=(main_b1 \  main_b2 \ main_b3   )
matrix define p_main= (main_p1 \  main_p2 \ main_p3  ) 

matrix list b_main 
matrix list p_main // Note: in case of changes in list of outcome variables, check table labels 


*** Export p-values 
svmat float p_main

rename p_main1 pval 
	cd "$userpath\1_datasets"
save retention_pvals.dta, replace 

gen line=_n 
gen outcomevar=""

replace line = 9 if line == 1
replace line = 10 if line == 2
replace line = 11 if line == 3
replace outcomevar="Left Firm (0/1)"			 			if line==9 
replace outcomevar="Quit (0/1)"			 					if line==10 
replace outcomevar="Layoff (0/1)"			 				if line==11


svmat float b_main 
rename b_main b 
order line outcomevar b pval 
 
	cd "$userpath\1_datasets"
save retention_pvals_labels.dta, replace 
	
	

	use main_pvals_labels.dta, clear
	append using retention_pvals_labels.dta 
	*append using emp_pvals_labels.dta 
	save pvals_labels.dta, replace
	
	erase "main_pvals_labels.dta" 
	erase "retention_pvals_labels.dta"	
	*erase "emp_pvals_labels.dta"
	
**# Bookmark #3: Calculate q-values 


* This code generates BKY (2006) sharpened two-stage q-values as described in Anderson (2008), "Multiple Inference and Gender Differences in the Effects of Early Intervention: A Reevaluation of the Abecedarian, Perry Preschool, and Early Training Projects", Journal of the American Statistical Association, 103(484), 1481-1495

* BKY (2006) sharpened two-stage q-values are introduced in Benjamini, Krieger, and Yekutieli (2006), "Adaptive Linear Step-up Procedures that Control the False Discovery Rate", Biometrika, 93(3), 491-507

* Last modified: M. Anderson, 11/20/07
* Test Platform: Stata/MP 10.0 for Macintosh (Intel 32-bit), Mac OS X 10.5.1
* Should be compatible with Stata 10 or greater on all platforms
* Likely compatible with with Stata 9 or earlier on all platforms (remove "version 10" line below)

clear  
version 17

****  INSTRUCTIONS:
****    Please start with a clear data set
****	When prompted, paste the vector of p-values you are testing into the "pval" variable
****	Please use the "do" button rather than the "run" button to run this file (if you use "run", you will miss the instructions at the prompts)

pause on
set more off


display "***********************************"
display "Please paste the vector of p-values that you wish to test into the variable 'pval'"
display	"After pasting, type 'q' to resume"
display "***********************************"

pause

	cd "$userpath\1_datasets"
use main_pvals.dta, clear 
append using retention_pvals.dta 

* Collect the total number of p-values tested

quietly sum pval
local totalpvals = r(N)

* Sort the p-values in ascending order and generate a variable that codes each p-value's rank

quietly gen int original_sorting_order = _n
quietly sort pval
quietly gen int rank = _n if pval~=.

* Set the initial counter to 1 

local qval = 1

* Generate the variable that will contain the BKY (2006) sharpened q-values

gen bky06_qval = 1 if pval~=.

* Set up a loop that begins by checking which hypotheses are rejected at q = 1.000, then checks which hypotheses are rejected at q = 0.999, then checks which hypotheses are rejected at q = 0.998, etc.  The loop ends by checking which hypotheses are rejected at q = 0.001.


while `qval' > 0 {
	* First Stage
	* Generate the adjusted first stage q level we are testing: q' = q/1+q
	local qval_adj = `qval'/(1+`qval')
	* Generate value q'*r/M
	gen fdr_temp1 = `qval_adj'*rank/`totalpvals'
	* Generate binary variable checking condition p(r) <= q'*r/M
	gen reject_temp1 = (fdr_temp1>=pval) if pval~=.
	* Generate variable containing p-value ranks for all p-values that meet above condition
	gen reject_rank1 = reject_temp1*rank
	* Record the rank of the largest p-value that meets above condition
	egen total_rejected1 = max(reject_rank1)

	* Second Stage
	* Generate the second stage q level that accounts for hypotheses rejected in first stage: q_2st = q'*(M/m0)
	local qval_2st = `qval_adj'*(`totalpvals'/(`totalpvals'-total_rejected1[1]))
	* Generate value q_2st*r/M
	gen fdr_temp2 = `qval_2st'*rank/`totalpvals'
	* Generate binary variable checking condition p(r) <= q_2st*r/M
	gen reject_temp2 = (fdr_temp2>=pval) if pval~=.
	* Generate variable containing p-value ranks for all p-values that meet above condition
	gen reject_rank2 = reject_temp2*rank
	* Record the rank of the largest p-value that meets above condition
	egen total_rejected2 = max(reject_rank2)

	* A p-value has been rejected at level q if its rank is less than or equal to the rank of the max p-value that meets the above condition
	replace bky06_qval = `qval' if rank <= total_rejected2 & rank~=.
	* Reduce q by 0.001 and repeat loop
	drop fdr_temp* reject_temp* reject_rank* total_rejected*
	local qval = `qval' - .001
}
	

	quietly sort original_sorting_order
	pause off
	set more on

	display "Code has completed."
	display "Benjamini Krieger Yekutieli (2006) sharpened q-vals are in variable 'bky06_qval'"
	display	"Sorting order is the same as the original vector of p-values"

	gen line=_n
	list line pval bky06_qval
	
	cd "$userpath\1_datasets"
	merge 1:1 line using pvals_labels.dta 

	list line outcomevar pval bky06_qval
	

	cd "$userpath\1_datasets"
	drop _merge 
	save pvals_all.dta, replace 

	
	
	
	
* Note: Sharpened FDR q-vals can be LESS than unadjusted p-vals when many hypotheses are rejected, because if you have many true rejections, then you can tolerate several false rejections too (this effectively just happens for p-vals that are so large that you are not going to reject them regardless).

	cd "$userpath\1_datasets"

	use pvals_all.dta, clear 
	order line outcomevar b pval bky06_qval
	drop original_sorting_order rank
	format b %9.3f
format pval %9.3f
format bky06_qval %9.3f
	cd "$userpath_writing/Tables"

	texsave  using "sharpened_q_values_18M.tex",  replace
	
	texsave line outcomevar b pval bky06_qval using "sharpened_q_values_18M.tex", ///
    title("Table B1: Sharpened q-values") ///
    headerlines("Estimate \#" "Outcome Variable" "Coefficient" "P-value" "Sharpened q-value") ///
    footnote("Note: Sharpened two-stage q-values are calculated as described in Anderson (2008) and introduced in Benjamini, Krieger, and Yekutieli (2006).") ///
    replace

	
	
	
	
	
	
	
	
****
* Pooled Sample
****
	
**# Bookmark #: Main Outcomes
	
*** Load dataset
// First, re-run relevant regressions (we will use pooled sample)

	cd "$userpath\1_datasets"
	use 4_individual_ano_reg_3.dta, clear
		
	
	
	
**# Bookmark #1: Export relevant values (coefficients, p-values ) 
	global outcomes    aminwage wrcon  sosec empquality_ind lwage satisfied hoursworked training_any 
	di "Outcomes: $outcomes"
	
	local c=1
	foreach o of global outcomes {
		
		
	* Repeat ITT estimation to recover coefficients & p-values 
	areg `o' assignment `o'_2021 $controls_balance missing_`o'_2021 i.wave  $se_indiv
	
	// Save coefficients in matrix 
	matrix define main_b`c'=(r(table)[1,1])
	matrix rownames main_b`c'=`o'
	
	// save p-values in matrix 
	matrix define main_p`c' = (r(table)[4,1])
	matrix rownames main_p`c'=`o'
		
	local ++c 
			
	}
	



	
*** Matrix with p-values 
clear 

matrix define b_main=(main_b1 \  main_b2 \ main_b3 \ main_b4 \ main_b5 \ main_b6 \ main_b7 \ main_b8)
matrix define p_main= (main_p1 \ main_p2 \ main_p3 \ main_p4 \ main_p5 \ main_p6 \ main_p7  \ main_p8) 

matrix list b_main 
matrix list p_main // Note: in case of changes in list of outcome variables, check table labels 


*** Export p-values 
svmat float p_main

rename p_main1 pval 
	cd "$userpath\1_datasets"
save main_pvals.dta, replace 

gen line=_n 
gen outcomevar=""

di  "empquality sh_aminwage sh_wrcon sh_sosec"

replace outcomevar="Min. Wage (0/1)" 				if line==1 
replace outcomevar="Written Contract (0/1)" 					if line==2
replace outcomevar="Social Security (0/1)" 					if line==3
replace outcomevar="Formality Index (0-1)" 			if line==4 
replace outcomevar="Wage (Log)" 							if line==5
replace outcomevar="Satisfied (0/1)" 						if line==6
replace outcomevar="Hours worked" 					if line==7
replace outcomevar="Training Participation (0/1)"			if line==8

svmat float b_main 
rename b_main b 
order line outcomevar b pval 
 
	cd "$userpath\1_datasets"
save main_pvals_labels.dta, replace 

	
	
	use main_pvals_labels.dta, clear

	save pvals_labels.dta, replace
	
	erase "main_pvals_labels.dta" 

	
**# Bookmark #3: Calculate q-values 


* This code generates BKY (2006) sharpened two-stage q-values as described in Anderson (2008), "Multiple Inference and Gender Differences in the Effects of Early Intervention: A Reevaluation of the Abecedarian, Perry Preschool, and Early Training Projects", Journal of the American Statistical Association, 103(484), 1481-1495

* BKY (2006) sharpened two-stage q-values are introduced in Benjamini, Krieger, and Yekutieli (2006), "Adaptive Linear Step-up Procedures that Control the False Discovery Rate", Biometrika, 93(3), 491-507

* Last modified: M. Anderson, 11/20/07
* Test Platform: Stata/MP 10.0 for Macintosh (Intel 32-bit), Mac OS X 10.5.1
* Should be compatible with Stata 10 or greater on all platforms
* Likely compatible with with Stata 9 or earlier on all platforms (remove "version 10" line below)

clear  
version 17

****  INSTRUCTIONS:
****    Please start with a clear data set
****	When prompted, paste the vector of p-values you are testing into the "pval" variable
****	Please use the "do" button rather than the "run" button to run this file (if you use "run", you will miss the instructions at the prompts)

pause on
set more off

/*
if _N>0 {
	display "Please clear data set before proceeding"
	display "After clearing, type 'q' to resume"
	pause
	}	
*/
*quietly gen float pval = .

display "***********************************"
display "Please paste the vector of p-values that you wish to test into the variable 'pval'"
display	"After pasting, type 'q' to resume"
display "***********************************"

pause

	cd "$userpath\1_datasets"
use main_pvals.dta, clear 

* Collect the total number of p-values tested

quietly sum pval
local totalpvals = r(N)

* Sort the p-values in ascending order and generate a variable that codes each p-value's rank

quietly gen int original_sorting_order = _n
quietly sort pval
quietly gen int rank = _n if pval~=.

* Set the initial counter to 1 

local qval = 1

* Generate the variable that will contain the BKY (2006) sharpened q-values

gen bky06_qval = 1 if pval~=.

* Set up a loop that begins by checking which hypotheses are rejected at q = 1.000, then checks which hypotheses are rejected at q = 0.999, then checks which hypotheses are rejected at q = 0.998, etc.  The loop ends by checking which hypotheses are rejected at q = 0.001.


while `qval' > 0 {
	* First Stage
	* Generate the adjusted first stage q level we are testing: q' = q/1+q
	local qval_adj = `qval'/(1+`qval')
	* Generate value q'*r/M
	gen fdr_temp1 = `qval_adj'*rank/`totalpvals'
	* Generate binary variable checking condition p(r) <= q'*r/M
	gen reject_temp1 = (fdr_temp1>=pval) if pval~=.
	* Generate variable containing p-value ranks for all p-values that meet above condition
	gen reject_rank1 = reject_temp1*rank
	* Record the rank of the largest p-value that meets above condition
	egen total_rejected1 = max(reject_rank1)

	* Second Stage
	* Generate the second stage q level that accounts for hypotheses rejected in first stage: q_2st = q'*(M/m0)
	local qval_2st = `qval_adj'*(`totalpvals'/(`totalpvals'-total_rejected1[1]))
	* Generate value q_2st*r/M
	gen fdr_temp2 = `qval_2st'*rank/`totalpvals'
	* Generate binary variable checking condition p(r) <= q_2st*r/M
	gen reject_temp2 = (fdr_temp2>=pval) if pval~=.
	* Generate variable containing p-value ranks for all p-values that meet above condition
	gen reject_rank2 = reject_temp2*rank
	* Record the rank of the largest p-value that meets above condition
	egen total_rejected2 = max(reject_rank2)

	* A p-value has been rejected at level q if its rank is less than or equal to the rank of the max p-value that meets the above condition
	replace bky06_qval = `qval' if rank <= total_rejected2 & rank~=.
	* Reduce q by 0.001 and repeat loop
	drop fdr_temp* reject_temp* reject_rank* total_rejected*
	local qval = `qval' - .001
}
	

	quietly sort original_sorting_order
	pause off
	set more on

	display "Code has completed."
	display "Benjamini Krieger Yekutieli (2006) sharpened q-vals are in variable 'bky06_qval'"
	display	"Sorting order is the same as the original vector of p-values"

	gen line=_n
	list line pval bky06_qval
	
	cd "$userpath\1_datasets"
	merge 1:1 line using pvals_labels.dta 

	list line outcomevar pval bky06_qval
	

	cd "$userpath\1_datasets"
	drop _merge 
	save pvals_all.dta, replace 

	
	
	
	
* Note: Sharpened FDR q-vals can be LESS than unadjusted p-vals when many hypotheses are rejected, because if you have many true rejections, then you can tolerate several false rejections too (this effectively just happens for p-vals that are so large that you are not going to reject them regardless).

	cd "$userpath\1_datasets"

	use pvals_all.dta, clear 
	order line outcomevar b pval bky06_qval
	drop original_sorting_order rank
	format b %9.3f
format pval %9.3f
format bky06_qval %9.3f
	cd "$userpath_writing/Tables"

	texsave  using "sharpened_q_values_pooled.tex",  replace
	
	texsave line outcomevar b pval bky06_qval using "sharpened_q_values_pooled.tex", ///
    title("Table B1: Sharpened q-values") ///
    headerlines("Estimate \#" "Outcome Variable" "Coefficient" "P-value" "Sharpened q-value") ///
    footnote("Note: Sharpened two-stage q-values are calculated as described in Anderson (2008) and introduced in Benjamini, Krieger, and Yekutieli (2006).") ///
    replace

	
	
	
	
	